Download the IMDB comments dataset and save it to a directory of your choice.
In [1]:
import pandas as pd # DataFrame operations
import json # Parse JSON strings
import nltk # Natural Language Toolkit: tokenization, stemming, stopwords, VADER
from bs4 import BeautifulSoup # Parse HTML strings to extract text
import re # Regex parser
import numpy as np # Linear algebra
from sklearn import * # Machine learning (imports sklearn submodules such as metrics, linear_model, feature_extraction)
import matplotlib.pyplot as plt # Visualization
# WordCloud may be difficult to install on Windows.
# Comment out the import below if you want to skip it.
from wordcloud import WordCloud # Word cloud visualization
import scipy #Sparse matrix
np.set_printoptions(precision=4)
pd.options.display.max_columns = 1000
pd.options.display.max_rows = 10
pd.options.display.float_format = lambda f: "%.4f" % f
%matplotlib inline
Run the following lines the first time you run this notebook on your system.
In [2]:
import nltk
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download('averaged_perceptron_tagger')
nltk.download("vader_lexicon")
Out[2]:
In [3]:
print(nltk.__version__)
Now let's see how to create a text classifier using NLTK and scikit-learn.
In [4]:
# The following shell command does not work on Windows systems
!head -n 1 /data/imdb-comments.json
In [5]:
data = []
with open("/data/imdb-comments.json", "r", encoding="utf8") as f:
for l in f.readlines():
data.append(json.loads(l))
In [6]:
comments = pd.DataFrame.from_dict(data)
comments.sample(10)
Out[6]:
In [7]:
comments.info()
In [8]:
comments.label.value_counts()
Out[8]:
In [9]:
comments.groupby(["label", "sentiment"]).content.count().unstack()
Out[9]:
In [10]:
np.random.seed(1)
v = list(comments["content"].sample(1))[0]
v
Out[10]:
In [11]:
comments.head()
Out[11]:
In [12]:
comments["content"].values[0]
Out[12]:
In [13]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()
In [14]:
sia.polarity_scores(comments["content"].values[0])
Out[14]:
In [15]:
def sentiment_score(text):
    return sia.polarity_scores(text)["compound"]

sentiment_score(comments["content"].values[0])
Out[15]:
In [16]:
%%time
comments["vader_score"] = comments["content"].apply(lambda text: sentiment_score(text))
In [17]:
comments["vader_sentiment"] = np.where(comments["vader_score"]>0, "pos", "neg")
In [18]:
comments.head()
Out[18]:
In [19]:
comments.vader_sentiment.value_counts()
Out[19]:
In [20]:
print(metrics.classification_report(comments["sentiment"], comments["vader_sentiment"]))
As we can see above, the accuracy is in the range of 0.70. The VADER model performed better on positive sentiment than on negative sentiment. Let's now build a statistical model on TF-IDF features, which generally performs better.
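As an optional check, a confusion matrix of the actual labels against the VADER predictions makes this per-class behaviour explicit (rows are actual labels, columns are predictions):
# Rows: actual sentiment, columns: VADER-predicted sentiment
pd.DataFrame(metrics.confusion_matrix(comments["sentiment"], comments["vader_sentiment"],
                                      labels=["neg", "pos"]),
             index=["actual neg", "actual pos"],
             columns=["pred neg", "pred pos"])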
In [21]:
def preprocess(text):
    # Remove HTML tags
    text = BeautifulSoup(text.lower(), "html5lib").text
    # Replace runs of consecutive non-word characters with a single space (" ")
    text = re.sub(r"[\W]+", " ", text)
    return text

preprocess(v)
Out[21]:
In [22]:
%%time
# Apply the preprocessing logic to all comments
comments["content"] = comments["content"].apply(preprocess)
In [23]:
comments_train = comments[comments["label"] == "train"]
comments_train.sample(10)
Out[23]:
In [24]:
comments_test = comments[comments["label"] == "test"]
comments_test.sample(10)
Out[24]:
In [25]:
X_train = comments_train["content"].values
y_train = np.where(comments_train.sentiment == "pos", 1, 0)
In [26]:
X_test = comments_test["content"].values
y_test = np.where(comments_test.sentiment == "pos", 1, 0)
In [27]:
# http://snowball.tartarus.org/algorithms/porter/stemmer.html
# http://www.nltk.org/howto/stem.html
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.porter import PorterStemmer
print(SnowballStemmer.languages)
In [28]:
porter = PorterStemmer()
snowball = SnowballStemmer("english")
lemmatizer = nltk.wordnet.WordNetLemmatizer()
values = []
for s in nltk.word_tokenize("""
revival
allowance
inference
relational
runner
runs
ran
has
having
generously
wasn't
leaves
swimming
relative
relating
"""):
    values.append((s, porter.stem(s),
                   snowball.stem(s), lemmatizer.lemmatize(s, "v")))
pd.DataFrame(values, columns = ["original", "porter", "snowball", "lemmatizer"])
Out[28]:
In [29]:
stopwords = nltk.corpus.stopwords.words("english")
print(len(stopwords), stopwords)
Let's drop the following words from the stopwords list since they are likely good indicators of sentiment.
In [30]:
stopwords.remove("no")
stopwords.remove("nor")
stopwords.remove("not")
In [31]:
sentence = """Financial Services revenues increased $0.5 billion, or 5%, primarily due to
lower impairments and volume growth, partially offset by lower gains."""
stemmer = SnowballStemmer("english")
#stemmer = PorterStemmer()
def my_tokenizer(s):
    terms = nltk.word_tokenize(s.lower())
    #terms = re.split("\s", s.lower())
    #terms = [re.sub(r"[\.!]", "", v) for v in terms if len(v)>2]
    #terms = [v for v in terms if len(v)>2]
    terms = [v for v in terms if v not in stopwords]
    terms = [stemmer.stem(w) for w in terms]
    #terms = [term for term in terms if len(term) > 2]
    return terms
print(my_tokenizer(sentence))
In [32]:
tfidf = feature_extraction.text.TfidfVectorizer(tokenizer=my_tokenizer, max_df = 0.95, min_df=0.0001
, ngram_range=(1, 2))
corpus = ["Today is Wednesday"
, "Delhi weather is hot today."
, "Delhi roads are not busy in the morning"]
doc_term_matrix = tfidf.fit_transform(corpus)
# returns term and index in the feature matrix
print("Vocabulary: ", tfidf.vocabulary_)
In [33]:
columns = [None] * len(tfidf.vocabulary_)
for term in tfidf.vocabulary_:
columns[tfidf.vocabulary_[term]] = term
columns
scores = pd.DataFrame(doc_term_matrix.toarray()
, columns= columns)
scores
Out[33]:
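For reference, with the defaults (smooth_idf=True, norm="l2") TfidfVectorizer computes idf(t) = ln((1 + n) / (1 + df(t))) + 1, where n is the number of documents and df(t) is the number of documents containing term t, and then l2-normalizes each row. The fitted weights can be inspected through the vectorizer's idf_ attribute, for example:
# Inverse document frequencies learned from the toy corpus above.
# smooth_idf=True (default): idf(t) = ln((1 + n_docs) / (1 + df(t))) + 1
pd.Series(tfidf.idf_, index=columns).sort_values(ascending=False)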
In [34]:
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)
In [35]:
X_test_tfidf.shape, y_test.shape, X_train_tfidf.shape, y_train.shape
Out[35]:
Let's estimate the memory requirement if the data were stored in dense matrix format.
In [36]:
cell_count = np.prod(X_train_tfidf.shape)
bytes = cell_count * 4 # assuming 4 bytes (float32) per cell
GBs = bytes / (1024 ** 3)
GBs
Out[36]:
In [37]:
sparsity = 1 - X_train_tfidf.count_nonzero() / cell_count
sparsity
Out[37]:
In [38]:
1 - X_train_tfidf.nnz / cell_count
Out[38]:
In [39]:
print("Type of doc_term_matrix", type(X_train_tfidf))
Byte size of the sparse training document-term matrix:
In [40]:
print(X_train_tfidf.data.nbytes / (1024.0 ** 3), "GB")
In [41]:
%%time
lr = linear_model.LogisticRegression(C = 0.6, random_state = 1
, n_jobs = 8, solver="saga")
lr.fit(X_train_tfidf, y_train)
y_train_pred = lr.predict(X_train_tfidf)
y_test_pred = lr.predict(X_test_tfidf)
print("Training accuracy: ", metrics.accuracy_score(y_train, y_train_pred))
print("Test accuracy: ", metrics.accuracy_score(y_test, y_test_pred))
In [42]:
fpr, tpr, thresholds = metrics.roc_curve(y_test,
    lr.predict_proba(X_test_tfidf)[:, 1])
auc = metrics.auc(fpr, tpr)
plt.plot(fpr, tpr)
plt.ylim(0, 1)
plt.xlim(0, 1)
plt.plot([0,1], [0,1], ls = "--", color = "k")
plt.xlabel("False Postive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve, auc: %.4f" % auc);
In [43]:
%%time
from sklearn import naive_bayes, ensemble
bayes = naive_bayes.MultinomialNB(alpha=1)
bayes.fit(X_train_tfidf, y_train)
print("accuracy: ", bayes.score(X_test_tfidf, y_test))
In [44]:
%%time
est = tree.DecisionTreeClassifier()
est.fit(X_train_tfidf, y_train)
print("accuracy: ", est.score(X_test_tfidf, y_test))
In [45]:
columns = [None] * len(tfidf.vocabulary_)
for term in tfidf.vocabulary_:
columns[tfidf.vocabulary_[term]] = term
result = pd.DataFrame({"feature": columns
, "importance": est.feature_importances_})
result = result.sort_values("importance", ascending = False)
result = result[result.importance > 0.0]
print("Top 50 terms: ", list(result.feature[:50]))
In [46]:
vocab_by_term = tfidf.vocabulary_
vocab_by_idx = {idx: term for term, idx in vocab_by_term.items()}
In [47]:
str(vocab_by_term)[:100]
Out[47]:
In [48]:
str(vocab_by_idx)[:100]
Out[48]:
In [49]:
idx = 5
print("Content:\n", X_train[idx])
row = X_train_tfidf[idx]
terms = [(vocab_by_idx[row.indices[i]], row.data[i])
for i, term in enumerate(row.indices)]
pd.Series(dict(terms)).sort_values(ascending = False)
Out[49]:
In [50]:
idx = 50
row = X_train_tfidf[idx]
terms = [(vocab_by_idx[row.indices[i]], row.data[i])
for i, term in enumerate(row.indices)]
top_terms= list(pd.Series(dict(terms))\
.sort_values(ascending = False)[:50].index)
wc = WordCloud(background_color="white",
width=500, height=500, max_words=50).generate("+".join(top_terms))
plt.figure(figsize=(10, 10))
plt.imshow(wc)
plt.axis("off");
In [51]:
%%time
tfidf =feature_extraction.text.TfidfVectorizer(
tokenizer=my_tokenizer
, stop_words = stopwords
, ngram_range=(1, 2)
)
pipe = pipeline.Pipeline([
("tfidf", tfidf),
("est", linear_model.LogisticRegression(C = 1.0, random_state = 1
, n_jobs = 8, solver="saga"))
])
pipe.fit(X_train, y_train)
Out[51]:
In [52]:
import pickle
In [53]:
with open("/tmp/model.pkl", "wb") as f:
pickle.dump(pipe, f)
In [54]:
!ls -lh /tmp/model.pkl
In [55]:
with open("/tmp/model.pkl", "rb") as f:
model = pickle.load(f)
In [56]:
doc1 = """when we started watching this series on
cable i had no idea how addictive it would be
even when you hate a character you hold back because
they are so beautifully developed you can almost
understand why they react to frustration fear greed
or temptation the way they do it s almost as if the
viewer is experiencing one of christopher s learning
curves i can t understand why adriana would put up with
christopher s abuse of her verbally physically and
emotionally but i just have to read the newspaper to
see how many women can and do tolerate such behavior
carmella has a dream house endless supply of expensive
things but i m sure she would give it up for a loving
and faithful husband or maybe not that s why i watch
it doesn t matter how many times you watch an episode
you can find something you missed the first five times
we even watch episodes out of sequence watch season 1
on late night with commercials but all the language a e
with language censored reruns on the movie network whenever
they re on we re there we ve been totally spoiled now i also
love the malaprop s an albacore around my neck is my favorite of
johnny boy when these jewels have entered our family vocabulary
it is a sign that i should get a life i will when the series
ends and i have collected all the dvd s and put the collection
in my will"""
doc1 = preprocess(doc1)
In [57]:
model.predict_proba(np.array([doc1]))[:, 1]
Out[57]:
HashingVectorizer converts a collection of text documents to a matrix of token occurrences using a deterministic hash function (MurmurHash3). It turns the collection into a scipy.sparse matrix holding token occurrence counts (or binary occurrence information), normalized as token frequencies if norm="l1" or projected onto the Euclidean unit sphere if norm="l2".
Advantages: it needs very little memory because no vocabulary is stored, it is fast to pickle since the vectorizer holds no fitted state, and it can be used for streaming or out-of-core learning.
Disadvantages: there is no way to map feature indices back to the original terms (no inverse transform), no IDF weighting is applied, and hash collisions can map different tokens to the same column (mitigated by choosing a large n_features).
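To make the idea concrete, here is a toy sketch of the hashing trick that reuses my_tokenizer from above. It maps each token straight to a column index with Python's built-in hash() (which, unlike the MurmurHash3 used by HashingVectorizer, is salted per process), so it is for illustration only:
def toy_hashing_vectorizer(docs, n_features=8):
    # Map each token directly to a column via a hash -- no vocabulary is stored.
    matrix = np.zeros((len(docs), n_features))
    for i, doc in enumerate(docs):
        for token in my_tokenizer(doc):
            matrix[i, hash(token) % n_features] += 1 # colliding tokens share a column
    # l2-normalize each row, mirroring HashingVectorizer's default norm="l2"
    norms = np.linalg.norm(matrix, axis=1, keepdims=True)
    return matrix / np.where(norms == 0, 1, norms)

toy_hashing_vectorizer(["Today is Wednesday", "Delhi weather is hot today."])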
In [58]:
hashing_vectorizer = feature_extraction.text.HashingVectorizer(n_features=2 ** 3
, tokenizer=my_tokenizer, ngram_range=(1, 2))
corpus = ["Today is Wednesday"
, "Delhi weather is hot today."
, "Delhi roads are not busy in the morning"]
doc_term_matrix = hashing_vectorizer.fit_transform(corpus)
pd.DataFrame(doc_term_matrix.toarray()) # Each cell is normalized (l2) row-wise
Out[58]:
In [59]:
%%time
n_features = int(X_train_tfidf.shape[1] * 0.8)
hashing_vectorizer = feature_extraction.text.HashingVectorizer(n_features=n_features
, tokenizer=my_tokenizer, ngram_range=(1, 2))
X_train_hash = hashing_vectorizer.fit_transform(X_train)
X_test_hash = hashing_vectorizer.transform(X_test)
In [60]:
X_train_hash
Out[60]:
In [61]:
X_train_hash.shape, X_test_hash.shape
Out[61]:
In [62]:
print(X_train_hash.data.nbytes / (1024.0 ** 3), "GB")
In [63]:
%%time
lr = linear_model.LogisticRegression(C = 1.0, random_state = 1,
solver = "liblinear")
lr.fit(X_train_hash, y_train)
y_train_pred = lr.predict(X_train_hash)
y_test_pred = lr.predict(X_test_hash)
print("Training accuracy: ", metrics.accuracy_score(y_train, y_train_pred))
print("Test accuracy: ", metrics.accuracy_score(y_test, y_test_pred))
In [64]:
print(metrics.classification_report(y_test, y_test_pred))